Proteomics: data analysis

Prerequisites

Load the packages needed for the analyses and read-in the relavant data

library(scales)
library(data.table)
library(vctrs)
library(tidyverse)
library(plotly)
library(TissueEnrich)
library(ggrepel)
setwd(
  "C:/Users/arun/OneDrive - Iowa State University/OrganizedDocuments/github/mouse.trophoblast.smallRNAseq"
)
dat <-
  read.csv("assets/data_processed.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
orig_data <-
  read.csv("assets/orig_data.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
source("assets/theme_clean.R")

quick check:

dat_tidy <- dat %>% drop_na()
dat %>% summarise(Number_of_proteins = n())
#>   Number_of_proteins
#> 1                174

Volcano plot

df <- orig_data
df$log2fc <- log(df$Ratio, base = 2)
df$negLog10p = -log10(df$Qvalue)
df$diffexpressed[df$log2fc <= -0.584962501 &
                   df$Qvalue <= 0.05] <- "up in Diff"
df$diffexpressed[df$log2fc >= 0.584962501 &
                   df$Qvalue <= 0.05] <- "up in Undiff"
df$diffexpressed[df$log2fc >= -0.584962501 &
                   df$log2fc <= 0.584962501] <- "other genes"
df$diffexpressed[df$Qvalue > 0.05] <- "other genes"
df$newGenes <-
  str_replace_all(string = df$Genes,
                  pattern = "\\;.*$",
                  replacement = "")
data.table::setDT(df)[diffexpressed == "other genes", newGenes := NA]

g <-
  ggplot(data = df,
         aes(
           x = log2fc,
           y = negLog10p,
           col = diffexpressed,
           label = newGenes
         )) +
  geom_point(alpha = 0.5) +
  theme_classic() +
  scale_color_manual(name = "Expression",
                     values = c("grey", "#c6007b", "#a0b600")) +
  ggtitle(paste("Diff vs. Undiff (proteomics)")) +
  #  geom_text_repel(show.legend = F) +
  xlab("Log2 fold change") +
  ylab("-log10 pvalue") +
  theme(legend.text.align = 0)
ggplotly(g)

Figure 1: Volcano plot (proteomics)

#ggsave("prot_volc.png", dpi=900, width = 8, height = 6)

PCA plot

This was run on another R version due to incompatibility with existing packages. The command is shown below, but it is not executed when knitting the document.

library("MetaboAnalystR")
library(ggplot2)
library(ggrepel)

setwd("~/metabo")
mSet <- InitDataObjects("conc", "stat", FALSE)
mSet <- Read.TextData(mSet, "data_processed.csv", "colu", "disc")

mSet <- SanityCheckData(mSet)
mSet <- RemoveMissingPercent(mSet, percent = 0.5)
mSet <- ImputeMissingVar(mSet, method = "min")
mSet <- PreparePrenormData(mSet)
mSet <-
  Normalization(mSet,
                "NULL",
                "LogNorm",
                "NULL",
                ratio = FALSE,
                ratioNum = 20)
mSet <-
  PlotSampleNormSummary(mSet, "snorm_0_", "png", 300, width = NA)
mSet <- PCA.Anal(mSet)
saveRDS(mSet, "mSet.rds")

Read in the saved RDS file from the previous step and plot the PCA for samples.

mSet <- readRDS("assets/mset.rds")
group <- sapply(strsplit(rownames(mSet$analSet$pca$x), "_"), "[", 1)
intgroup.df <- as.data.frame(group)
d <- data.frame(
  PC1 = mSet$analSet$pca$x[, 1],
  PC2 = mSet$analSet$pca$x[, 2],
  intgroup.df,
  name = rownames(mSet$analSet$pca$x)
)

g <- ggplot(d, aes(PC1, PC2, color = group)) +
  scale_shape_manual(values = 1:10) +
  scale_color_manual(values = c('Diff'      = '#c6007b',
                                'Undiff'  = '#a0b600')) +
  theme_classic() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  geom_text_repel(aes(label = name)) +
  xlab(paste("PC1", round(mSet$analSet$pca$variance[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(mSet$analSet$pca$variance[2] * 100, 2), "% variance"))
g
Figure 2: PCA plot (proteomics)

Figure 2: PCA plot (proteomics)

Tissue Enrichment

plotTE <- function(inputGenes = gene.list,
                   myColor = "color") {
  gs <-
    GeneSet(geneIds = inputGenes,
            organism = "Mus Musculus",
            geneIdType = SymbolIdentifier())
  output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
  en.output <-
    setNames(data.frame(assay(output[[1]]),
                        row.names = rowData(output[[1]])[, 1]),
             colData(output[[1]])[, 1])
  en.output$Tissue <- rownames(en.output)
  logp <- -log10(0.05)
  en.output <-
    mutate(en.output,
           significance = ifelse(Log10PValue > logp,
                                 "colored", "nocolor"))
  en.output$Sig <- "NA"
  ggplot(en.output, aes(reorder(Tissue, Log10PValue),
                        Log10PValue,
                        fill = significance)) +
    geom_bar(stat = 'identity') +
    theme_clean() + ylab("- log10 adj. p-value") + xlab("") +
    scale_fill_manual(values = c("colored" = myColor, "nocolor" = "gray")) +
    scale_y_continuous(expand = expansion(mult = c(0, .1)),
                       breaks = scales::pretty_breaks()) +
    coord_flip()
}
exp <- log(1.2, 2)
diff <- df$newGenes[df$log2fc <= -exp & df$Qvalue <= 0.05]
undiff <- df$newGenes[df$log2fc >= exp & df$Qvalue <= 0.05]
df.diff <- dat_tidy %>%
  rowwise() %>%
  mutate(Diff = mean(c(
    Diff_rep1,   Diff_rep2,  Diff_rep3,  Diff_rep4,  Diff_rep5
  ))) %>%
  dplyr::select(proteinID, Diff) %>%
  dplyr::filter(Diff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Diff, 4)) %>%
  mutate(decile = ntile(Diff, 10))
df.undiff <- dat_tidy %>%
  rowwise() %>%
  mutate(Undiff = mean(
    c(
      Undiff_rep1,
      Undiff_rep2,
      Undiff_rep3,
      Undiff_rep4,
      Undiff_rep5
    )
  )) %>%
  dplyr::select(proteinID, Undiff) %>%
  dplyr::filter(Undiff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Undiff, 4)) %>%
  mutate(decile = ntile(Undiff, 10))

filterCuts <- function(dataIn = df.undiff,
                       cutOff = 4,
                       type = decile) {
  type <- enquo(type)
  dataIn %>%
    dplyr::filter(!!type == cutOff) %>%
    dplyr::select(proteinID)
}
undiff.75pc <- filterCuts(df.undiff, 4, type = quart)
diff.75pc <- filterCuts(df.diff, 4, type = quart)
undiff.90pc <- filterCuts(df.undiff, 10, type = decile)
diff.90pc <- filterCuts(df.diff, 10, type = decile)

Enrichment plots (TissueEnrich)

Diff DE

plotTE(unique(diff), myColor = "#c6007b")
Figure 3A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Figure 3A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Undiff DE

plotTE(unique(undiff), myColor = "#a0b600")
Figure 3B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Figure 3B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Diff 75th pc

plotTE(as.character(diff.75pc$proteinID), "#c6007b")
Figure 3C: Top quartile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3C: Top quartile proteins (Diff) enrichment in Mouse ENCODE data

Diff 90th pc

plotTE(as.character(diff.90pc$proteinID), "#c6007b")
Figure 3D: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3D: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Undiff 75th pc

plotTE(as.character(undiff.75pc$proteinID), "#a0b600")
Figure 3E: Top quartile proteins (Undiff) enrichment in Mouse ENCODE data

Figure 3E: Top quartile proteins (Undiff) enrichment in Mouse ENCODE data

Undiff 90th pc

plotTE(as.character(undiff.90pc$proteinID), "#a0b600")
Figure 3F: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3F: Top decile proteins (Diff) enrichment in Mouse ENCODE data